import pandas as pd
import numpy as np
# Load the NCES CCD extract and inspect the raw column headers
df = pd.read_csv("NCES_CCD_extract.csv")
column_names = list(df.columns)
print(column_names)
['Agency Name', 'State Name [District] Latest available year', 'State Abbr [District] Latest available year', 'Agency ID - NCES Assigned [District] Latest available year', 'Hispanic Students [District] 2019-20', 'Hispanic Students [District] 2018-19', 'Hispanic Students [District] 2017-18', 'Hispanic Students [District] 2016-17', 'Hispanic Students [District] 2015-16', 'Hispanic Students [District] 2014-15', 'Black or African American Students [District] 2019-20', 'Black or African American Students [District] 2018-19', 'Black or African American Students [District] 2017-18', 'Black or African American Students [District] 2016-17', 'Black or African American Students [District] 2015-16', 'Black or African American Students [District] 2014-15', 'White Students [District] 2019-20', 'White Students [District] 2018-19', 'White Students [District] 2017-18', 'White Students [District] 2016-17', 'White Students [District] 2015-16', 'White Students [District] 2014-15', 'Fall Membership (V33) [District Finance] 2017-18', 'Fall Membership (V33) [District Finance] 2016-17', 'Fall Membership (V33) [District Finance] 2015-16', 'Fall Membership (V33) [District Finance] 2014-15', 'Total General Revenue (TOTALREV) [District Finance] 2017-18', 'Total General Revenue (TOTALREV) [District Finance] 2016-17', 'Total General Revenue (TOTALREV) [District Finance] 2015-16', 'Total General Revenue (TOTALREV) [District Finance] 2014-15', 'Total Expenditures (TOTALEXP) [District Finance] 2017-18', 'Total Expenditures (TOTALEXP) [District Finance] 2016-17', 'Total Expenditures (TOTALEXP) [District Finance] 2015-16', 'Total Expenditures (TOTALEXP) [District Finance] 2014-15']
# Rename for easy use: one short, lowercase name per verbose NCES header.
# (Backslash continuations are unnecessary inside the call's parentheses.)
df1 = df.rename(columns={
    'Agency Name': 'distname',
    'State Name [District] Latest available year': 'stname',
    'State Abbr [District] Latest available year': 'stabbr',
    'Agency ID - NCES Assigned [District] Latest available year': 'distid',
    'Hispanic Students [District] 2019-20': 'hisp2019',
    'Hispanic Students [District] 2018-19': 'hisp2018',
    'Hispanic Students [District] 2017-18': 'hisp2017',
    'Hispanic Students [District] 2016-17': 'hisp2016',
    'Hispanic Students [District] 2015-16': 'hisp2015',
    'Hispanic Students [District] 2014-15': 'hisp2014',
    'Black or African American Students [District] 2019-20': 'black2019',
    'Black or African American Students [District] 2018-19': 'black2018',
    'Black or African American Students [District] 2017-18': 'black2017',
    'Black or African American Students [District] 2016-17': 'black2016',
    'Black or African American Students [District] 2015-16': 'black2015',
    'Black or African American Students [District] 2014-15': 'black2014',
    'White Students [District] 2019-20': 'white2019',
    'White Students [District] 2018-19': 'white2018',
    'White Students [District] 2017-18': 'white2017',
    'White Students [District] 2016-17': 'white2016',
    'White Students [District] 2015-16': 'white2015',
    'White Students [District] 2014-15': 'white2014',
    'Fall Membership (V33) [District Finance] 2017-18': 'mem2017',
    'Fall Membership (V33) [District Finance] 2016-17': 'mem2016',
    'Fall Membership (V33) [District Finance] 2015-16': 'mem2015',
    'Fall Membership (V33) [District Finance] 2014-15': 'mem2014',
    'Total General Revenue (TOTALREV) [District Finance] 2017-18': 'totrev2017',
    'Total General Revenue (TOTALREV) [District Finance] 2016-17': 'totrev2016',
    'Total General Revenue (TOTALREV) [District Finance] 2015-16': 'totrev2015',
    'Total General Revenue (TOTALREV) [District Finance] 2014-15': 'totrev2014',
    'Total Expenditures (TOTALEXP) [District Finance] 2017-18': 'totexp2017',
    'Total Expenditures (TOTALEXP) [District Finance] 2016-17': 'totexp2016',
    'Total Expenditures (TOTALEXP) [District Finance] 2015-16': 'totexp2015',
    'Total Expenditures (TOTALEXP) [District Finance] 2014-15': 'totexp2014',
})
df1_column_names = df1.columns.tolist()
print(df1_column_names)
['distname', 'stname', 'stabbr', 'distid', 'hisp2019', 'hisp2018', 'hisp2017', 'hisp2016', 'hisp2015', 'hisp2014', 'black2019', 'black2018', 'black2017', 'black2016', 'black2015', 'black2014', 'white2019', 'white2018', 'white2017', 'white2016', 'white2015', 'white2014', 'mem2017', 'mem2016', 'mem2015', 'mem2014', 'totrev2017', 'totrev2016', 'totrev2015', 'totrev2014', 'totexp2017', 'totexp2016', 'totexp2015', 'totexp2014']
# Destring the variables that should be numeric.
# Column names follow the pattern <measure><year>, so generate the list
# instead of spelling out all 31 names by hand.
numeric_columns = (
    ['distid']
    + [f'{grp}{yr}' for grp in ('hisp', 'black', 'white') for yr in range(2019, 2013, -1)]
    + [f'{grp}{yr}' for grp in ('mem', 'totrev', 'totexp') for yr in range(2017, 2013, -1)]
)
# copy the dataframe to df2 for future convenience
df2 = df1.copy()
df2[numeric_columns] = df2[numeric_columns].apply(pd.to_numeric, errors='coerce')
missing_data = df2[numeric_columns].isna()
df3 = df2.dropna(subset=numeric_columns)
# Reshape wide -> long, split e.g. 'hisp2019' into ('hisp', '2019'),
# then pivot back so each measure is a column and year joins the index.
id_cols = ['distid', 'distname', 'stname', 'stabbr']
df4 = df3.melt(id_vars=id_cols, var_name='name_year')
df4[['name', 'year']] = df4['name_year'].str.extract(r'([A-Za-z]+)([0-9]+)')
df4 = df4.pivot_table(index=id_cols + ['year'], columns='name', values='value', aggfunc='sum').reset_index()
def describe_data(data):
    """Print a Stata-style ``describe`` of *data*.

    Shows the observation and variable counts, then one line per column
    with its name, pandas dtype, and a display format string (numeric
    dtypes get '%10.0g'; object columns get a width-padded '%Ns').
    """
    n_obs, n_vars = data.shape
    print(f"Observations: {n_obs}")
    print(f"Variables: {n_vars}")
    print("\nVariable Storage Display")
    print(f"name\ttype\tformat")
    for col_name, col_dtype in data.dtypes.items():
        if col_dtype in ('int64', 'float64'):
            fmt = '%10.0g'
        elif col_dtype == 'object':
            # pad to at least 9 characters, matching Stata's string formats
            fmt = f'%{max(len(col_name), 9)}s'
        else:
            fmt = ''
        print(f"{col_name}\t{col_dtype}\t{fmt}")
# Stata-style "describe" of the reshaped long panel built above
describe_data(df4)
Observations: 59370 Variables: 11 Variable Storage Display name type format distid float64 %10.0g distname object %9s stname object %9s stabbr object %9s year object %9s black float64 %10.0g hisp float64 %10.0g mem float64 %10.0g totexp float64 %10.0g totrev float64 %10.0g white float64 %10.0g
# Question 2 - Per-pupil revenue and per-pupil expenditures.
# Each step returns a fresh frame (assign copies), preserving the
# df5..df8 checkpoints used by the later questions.
df5 = df4.assign(
    pupil_per_rev=lambda d: d['totrev'] / d['mem'],
    pupil_per_exp=lambda d: d['totexp'] / d['mem'],
)
# log1p keeps zero-valued districts finite
df6 = df5.assign(
    pupil_per_rev_log=lambda d: np.log1p(d['pupil_per_rev']),
    pupil_per_exp_log=lambda d: np.log1p(d['pupil_per_exp']),
)
# Combined Black/Hispanic headcount and its share of membership
df7 = df6.assign(black_hisp_total=lambda d: d['black'] + d['hisp'])
df7 = df7.assign(black_hisp_share=lambda d: d['black_hisp_total'] / d['mem'])
# Logged totals for the scatter plots below
df8 = df7.assign(
    totrev_log=lambda d: np.log1p(d['totrev']),
    totexp_log=lambda d: np.log1p(d['totexp']),
)
import matplotlib.pyplot as plt

# One scatter per outcome (log revenues, then log expenditures), each
# plotted against the district's Black/Hispanic enrollment share.
for outcome_col, outcome_label in (('totrev_log', 'Log of Total Revenues'),
                                   ('totexp_log', 'Log of Total Expenditures')):
    plt.figure(figsize=(10, 5))
    plt.scatter(df8['black_hisp_share'], df8[outcome_col], alpha=0.5)
    plt.title('Correlation between Black/Hispanic Enrollment Share and ' + outcome_label)
    plt.xlabel('Black/Hispanic Enrollment Share')
    plt.ylabel(outcome_label)
    plt.grid(True)
    plt.show()
# Question 3 - In a single figure, plot both the average log of total per-pupil revenue and the average log of total per-pupil expenditures against quantiles of the Black/Hispanic student share.
# Question 4
df9 = df8.copy()
# Bin districts into 100 quantiles (0-99) of Black/Hispanic share
df9['quantile'] = pd.qcut(df9['black_hisp_share'], q=100, labels=False)
# Mean log revenue / expenditure within each quantile bin
quantile_stats = (
    df9.groupby('quantile')[['totrev_log', 'totexp_log']]
    .mean()
    .reset_index()
)
import matplotlib.pyplot as plt
quantile_stats_array = quantile_stats.to_numpy()
plt.figure(figsize=(10, 5))
# Column 0 is the quantile index; columns 1 and 2 are the two averages
for value_col, curve_label in ((1, 'Avg Log Total Per Pupil Revenue'),
                               (2, 'Avg Log Total Per Pupil Expenditures')):
    plt.plot(quantile_stats_array[:, 0], quantile_stats_array[:, value_col], label=curve_label)
plt.title('Average Log Total Per Pupil Revenue and Expenditures by Black/Hispanic Student Quantiles')
plt.xlabel('Quantiles of Black/Hispanic Student Share')
plt.ylabel('Average Log Value')
plt.legend()
plt.grid(True)
plt.show()
# (Question prompt, continued) ...per-pupil expenditures against quantiles of the Black/Hispanic student share. What patterns emerge?
# Group the data by state (stabbr)
state_groups = df9.groupby('stabbr')
# State-level means of the demographic counts, the combined share, and
# per-pupil expenditures
state_summary = (
    state_groups[['black', 'hisp', 'black_hisp_share', 'pupil_per_exp']]
    .mean()
    .reset_index()
)
state_summary.head()
| name | stabbr | black | hisp | black_hisp_share | pupil_per_exp |
|---|---|---|---|---|---|
| 0 | DC | 1832.826923 | 493.448718 | 0.758782 | 26269.471453 |
| 1 | DE | 1087.900901 | 617.981982 | 0.469254 | 15404.021295 |
| 2 | FL | 9283.910448 | 13691.378109 | 0.382428 | 10569.113415 |
| 3 | GA | 3497.125683 | 1473.353370 | 0.431537 | 11374.751516 |
| 4 | HI | 3189.666667 | 24591.666667 | 0.143884 | 15281.662068 |
# Rename the columns if needed
state_summary.rename(columns={'black_hisp_share': 'Avg_black_hist_Share',
                              'pupil_per_exp': 'Avg_Per_Pupil_Exp'}, inplace=True)
# State-average per-pupil expenditures vs. average Black/Hispanic share
plt.scatter(state_summary['Avg_black_hist_Share'], state_summary['Avg_Per_Pupil_Exp'], alpha=0.5)
# Tag each point with its two-letter state abbreviation
for abbr, share, spend in zip(state_summary['stabbr'],
                              state_summary['Avg_black_hist_Share'],
                              state_summary['Avg_Per_Pupil_Exp']):
    plt.annotate(abbr, (share, spend))
# Axis labels and title
plt.xlabel('Average Black/Hispanic Enrollment Share')
plt.ylabel('Average Per-Pupil Expenditures')
plt.title('Relationship between State-Average Student Expenditures and Combined Black/Hispanic Enrollment Share')
# Display the plot
plt.grid(True)
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import contextily as ctx
import geopandas as gpd
import os
from mpl_toolkits.axes_grid1 import make_axes_locatable
# Get stats
# Retrieve the official shapefile maps from the US Census Bureau
# link: https://www.census.gov/cgi-bin/geo/shapefiles/index.php
path = "tl_2022_us_state/tl_2022_us_state.shp"
# Read the Census TIGER/Line state polygons and reproject to WGS84 lat/lon
gdf = gpd.read_file(path).to_crs("EPSG:4326")
gdf.head(1)
| REGION | DIVISION | STATEFP | STATENS | GEOID | STUSPS | NAME | LSAD | MTFCC | FUNCSTAT | ALAND | AWATER | INTPTLAT | INTPTLON | geometry | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3 | 5 | 54 | 01779805 | 54 | WV | West Virginia | 00 | G4000 | A | 62266456923 | 489045863 | +38.6472854 | -080.6183274 | POLYGON ((-77.75438 39.33346, -77.75422 39.333... |
# basic plot
# Quick sanity-check render of the state geometries with default styling
gdf.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7fd20f26e280>
# Remove non-continental states/territories for better visualization
non_continental = ['HI','VI','MP','GU','AK','AS','PR']
# Keep only rows whose state code is not in the exclusion list
us49 = gdf[~gdf['STUSPS'].isin(non_continental)]
# Simple map for US States based on the downloaded file
f, ax = plt.subplots(1, 1, figsize=(8, 6), sharex=True, sharey=True, dpi=300)
plt.title('Simple Map of US States')
# Carve a slim axis off the right edge to host the colorbar
divider = make_axes_locatable(ax)
cax = divider.append_axes("right", size="3%", pad=0, alpha=0.5)
us49.plot('ALAND', ax=ax, alpha=0.5, cmap='Pastel1', edgecolor='k', legend=True, cax=cax, linewidth=0.1)
plt.show()
# Combine the processed district data with the state geo information.
# Sanity-check state_summary (built in the previous questions).
state_summary.head(3)
| stabbr | black | hisp | Avg_black_hist_Share | Avg_Per_Pupil_Exp | |
|---|---|---|---|---|---|
| 0 | DC | 1832.826923 | 493.448718 | 0.758782 | 26269.471453 |
| 1 | DE | 1087.900901 | 617.981982 | 0.469254 | 15404.021295 |
| 2 | FL | 9283.910448 | 13691.378109 | 0.382428 | 10569.113415 |
# Combine the data with geo information
import geopandas as gpd
import matplotlib.pyplot as plt

# .copy() so the later in-place cleanup of 'stabbr' edits an independent
# frame rather than a view of state_summary (avoids the
# SettingWithCopyWarning the original run produced).
my_data = state_summary[['stabbr', 'Avg_black_hist_Share']].copy()
merged = us49.merge(my_data, left_on='STUSPS', right_on='stabbr', how='inner')
# Filter out non-continental states
non_continental = ['HI', 'VI', 'MP', 'GU', 'AK', 'AS', 'PR']
merged = merged[~merged['STUSPS'].isin(non_continental)]
# Replace NaN shares with a default value. Assign the result back instead
# of calling fillna(inplace=True) on a column slice: chained in-place
# fillna is unreliable and deprecated under pandas copy-on-write.
column_to_plot = 'Avg_black_hist_Share'
default_value = 0
merged[column_to_plot] = merged[column_to_plot].fillna(default_value)
plt.show()  # NOTE(review): nothing has been drawn yet, so this is a no-op
data_types = my_data.dtypes
print(data_types)
stabbr object Avg_black_hist_Share float64 dtype: object
# Remove leading/trailing whitespaces and convert to uppercase.
# Work on explicit copies first: both frames were produced by slicing, so
# writing into them directly raised the SettingWithCopyWarning visible in
# the original transcript and may silently fail under pandas copy-on-write.
us49 = us49.copy()
my_data = my_data.copy()
us49['STUSPS'] = us49['STUSPS'].str.strip().str.upper()
my_data['stabbr'] = my_data['stabbr'].str.strip().str.upper()
# Perform an inner join on the normalized state codes
df_join = us49.merge(my_data, left_on='STUSPS', right_on='stabbr', how='inner')
df_join.head(3)
<ipython-input-37-ffcc16cb307c>:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy my_data['stabbr'] = my_data['stabbr'].str.strip().str.upper()
| REGION | DIVISION | STATEFP | STATENS | GEOID | STUSPS | NAME | LSAD | MTFCC | FUNCSTAT | ALAND | AWATER | INTPTLAT | INTPTLON | geometry | stabbr | Avg_black_hist_Share | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3 | 5 | 54 | 01779805 | 54 | WV | West Virginia | 00 | G4000 | A | 62266456923 | 489045863 | +38.6472854 | -080.6183274 | POLYGON ((-77.75438 39.33346, -77.75422 39.333... | WV | 0.036465 |
| 1 | 3 | 5 | 12 | 00294478 | 12 | FL | Florida | 00 | G4000 | A | 138962819934 | 45971472526 | +28.3989775 | -082.5143005 | MULTIPOLYGON (((-83.10874 24.62949, -83.10711 ... | FL | 0.382428 |
| 2 | 2 | 3 | 17 | 01779784 | 17 | IL | Illinois | 00 | G4000 | A | 143778515726 | 6216539665 | +40.1028754 | -089.1526108 | POLYGON ((-87.89243 38.28285, -87.89334 38.282... | IL | 0.248599 |
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable

def StatesPlot(df, data, cmap):
    """Draw a choropleth of *df* colored by column *data* with colormap *cmap*.

    A slim axis is carved off the right edge for the colorbar, which is
    titled with the plotted column name. Shows the figure; returns None.
    """
    fig, ax = plt.subplots(1, 1, figsize=(15, 10), sharex=True, sharey=True, dpi=300)
    fig.tight_layout()
    plt.title('United States Map - Variable = ' + data)
    ax.set_axis_off()
    colorbar_ax = make_axes_locatable(ax).append_axes("right", size="3%", pad=0.5, alpha=0.5)
    # Render the choropleth with a legend hosted in the side axis
    df.plot(data, ax=ax, alpha=0.5, cmap=cmap, edgecolor='k', legend=True, cax=colorbar_ax, linewidth=0.1)
    colorbar_ax.set_title(data, fontsize=12)
    plt.show()

# Example usage with df_join:
StatesPlot(df_join, 'Avg_black_hist_Share', 'YlOrRd')
# Specify the path of the local shapefile used for the NYC map
import geopandas as gpd
import matplotlib.pyplot as plt
shapefile_path = 'Green_Cart_Bnd_2008_DOHMH_2010/Green_Cart_Bnd_2008_DOHMH_2010.shp'
# NYC Green Cart boundary polygons (DOHMH, 2008/2010)
gdf_states_ny = gpd.read_file(shapefile_path)
# gdf_states_ny.head()
gdf_states_ny.plot()
plt.title('Map of Your Dataset')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.show()
gdf_states_ny.head()
| BOROCODE | COUNT_PREC | FIRST_BORO | FIRST_BO_1 | INCL_PREC | geometry | |
|---|---|---|---|---|---|---|
| 0 | 1 | 8 | 1 | Manhattan | 23, 25, 26, 28, 30, 32, 33, 34\r\n33, and 34. | MULTIPOLYGON (((1007642.105 227756.197, 100742... |
| 1 | 2 | 10 | 2 | The Bronx | 40, 41, 42, 44, 46, 47, 48, 49, 52 | MULTIPOLYGON (((1012821.806 229228.265, 101278... |
| 2 | 3 | 9 | 3 | Brooklyn | 67, 70, 71, 73, 75, 77, 81, 83 | MULTIPOLYGON (((1027452.041 156400.814, 102683... |
| 3 | 4 | 4 | 4 | Queens | 100, 101, 103, 113 | MULTIPOLYGON (((1032066.245 153833.959, 103147... |
| 4 | 5 | 1 | 5 | Staten Island | 120 | MULTIPOLYGON (((968962.134 160164.304, 969308.... |
# Map of NYC
f, ax = plt.subplots(1, 1, figsize=(12, 10), sharex=True, sharey=True, dpi=300)
# Plot the boroughs once. The original drew the identical layer twice and
# appended a colorbar axis that a categorical (legend=True) plot never
# uses, doubling the render work and leaving an empty strip on the right.
gdf_states_ny.plot('FIRST_BO_1', ax=ax, alpha=0.5, cmap='Pastel1', edgecolor='k', legend=True, linewidth=0.1)
plt.title('Map of NYC')
plt.show()
import geopandas as gpd
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
shapefile_path = 'tl_2022_49_tabblock20/tl_2022_49_tabblock20.shp'
# Census block polygons for Utah (FIPS 49) -- a large, slow-to-draw layer
gdf_states_ut = gpd.read_file(shapefile_path)
f, ax = plt.subplots(1, 1, figsize=(12, 10), sharex=True, sharey=True, dpi=300)
# Plot once: the original rendered the same block-level layer a second
# time into a manually appended colorbar axis, doubling a very expensive
# draw without changing the visible result.
gdf_states_ut.plot('ALAND20', ax=ax, alpha=0.5, cmap='Pastel1', edgecolor='k', legend=True, linewidth=0.1)
plt.title('Map of Utah')
plt.show()
import geopandas as gpd
import osmnx as ox
import matplotlib.pyplot as plt
# Specify the name that is used to search for the data
place_name = "Salt Lake City, Utah, USA"
# Get place boundary related to the place name as a geodataframe
area = ox.geocode_to_gdf(place_name)
# Display the boundary GeoDataFrame (notebook cell output)
area
| geometry | bbox_north | bbox_south | bbox_east | bbox_west | place_id | osm_type | osm_id | lat | lon | class | type | place_rank | importance | addresstype | name | display_name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | POLYGON ((-112.10139 40.82715, -112.10132 40.8... | 40.853391 | 40.699926 | -111.740484 | -112.101392 | 315499582 | relation | 198770 | 40.75962 | -111.886797 | boundary | administrative | 16 | 0.637243 | city | Salt Lake City | Salt Lake City, Salt Lake County, Utah, United... |
# List key-value pairs for tags
tags = {'building': True}
# osmnx renamed geometries_from_place -> features_from_place; the original
# run emitted a deprecation warning and the old name will be removed.
# Prefer the new API, falling back for older osmnx versions.
if hasattr(ox, 'features_from_place'):
    buildings = ox.features_from_place(place_name, tags)
else:
    buildings = ox.geometries_from_place(place_name, tags)
buildings.head()
/var/folders/q5/_56gtjhj19s4q7xtbc4g0chc0000gn/T/ipykernel_13851/2857596332.py:4: UserWarning: The `geometries` module and `geometries_from_X` functions have been renamed the `features` module and `features_from_X` functions. Use these instead. The `geometries` module and function names are deprecated and will be removed in a future release. buildings = ox.geometries_from_place(place_name, tags)
| building | ele | gnis:Class | gnis:County | gnis:County_num | gnis:ST_alpha | gnis:ST_num | gnis:feature_id | name | geometry | ... | tower:type | passports | payment:paypal | area | service:vehicle:painting | substation | construction:aeroway | faculty | ways | type | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| element_type | osmid | |||||||||||||||||||||
| node | 150941193 | yes | 1337 | Populated Place | Salt Lake | 035 | UT | 49 | 1449753 | Tara Condominium | POINT (-111.89322 40.77525) | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 150945663 | yes | 1325 | Populated Place | Salt Lake | 035 | UT | 49 | 1449754 | Trevi Towers Condominium | POINT (-111.89311 40.77462) | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | |
| 150948580 | yes | 1464 | Populated Place | Salt Lake | 035 | UT | 49 | 1449779 | Northpoint Estates Condominium | POINT (-111.87954 40.78641) | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | |
| 150954906 | yes | 1288 | Populated Place | Salt Lake | 035 | UT | 49 | 1449799 | Villa Montaige Condominium | POINT (-111.93105 40.76022) | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | |
| 150954929 | residential | 1357 | Populated Place | Salt Lake | 035 | UT | 49 | 1454367 | The Avenues Heritage Condominium | POINT (-111.87124 40.77219) | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 334 columns
# Plot footprints
# Render every fetched building geometry with geopandas' default styling
buildings.plot()
<Axes: >